CHURN PREDICTION CASE STUDY
Data Manipulation 2.1 Replace empty spaces with NaN in "total charges" column 2.2 Drop null values from "total charges" column which contains .15% missing data 2.3 Convert "total charges" to float. 2.4 Replace "No Internet Service" with "No" for the following columns
"Online Security","Online Backup","Device Protection","TechSupport","Streaming TV","Streaming Movies"
2.5 Replace "Senior Citizen" values {1:Yes,0:No} 2.6 Create a function "def Tenure_lab" to convert tenure to categories as follows.
<= 12 -> "Tenure_0-12"
> 12, <24 -> "Tenure_12-24"
> 24 , <=48 -> "Tenure_24-48"
>48, <=60 -> "Tenure_48-60"
> 60 -> "Tenure_gt_60"
def Tenure_lab(tenure):  # pass each dataframe row
    if tenure <= 12:
        return "Tenure_0-12"
    elif tenure > 12 and tenure <= 24:
        return "Tenure_12-24"
    elif tenure > 24 and tenure <= 48:
        return "Tenure_24-48"
    elif tenure > 48 and tenure <= 60:
        return "Tenure_48-60"
    else:
        return "Tenure_gt-60"
2.7 Separate churn and not churn customers into variables churn, not_churn. 2.8 Separate categorical and numerical columns into variables cat_cols, num_cols.
3.1 def plot_pie(column) Description : Using all categorical columns, construct a pie chart for churn and non churn customers
3.2 def histogram(column)
Description : Using all numerical columns, make histogram to compare churn and non churn customers.
3.3 Create a function called "def scatter_matrix(df)"
Description : For all numeric columns construct scatterplot which can visualize churn and non churn customer in one graph.
3.4 For all different tenure groups build a barplot to visualize churn and non churn customers.
3.5 Create a scatter plot between "Total Charges" and "Monthly Charges" and color code the data points using tenure groups.
3.6 Repeat above, color code to churn and not churn.
3.7 Construct barplot for Total Charges and Monthly Charges seperately with tenure group on x-axis and bar should be depicting churn and not churn for each customer tenure group.
3.8 Construct a correlation heatmap for all numerical variables.MAIL TO : loveesh.imarticus@gmail.com SUBJECT LINE : FirstName_MLP3 FILES : "html","ipynb"
# Import all the libraries required
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,precision_score,recall_score, roc_curve, roc_auc_score, confusion_matrix,auc
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from yellowbrick.classifier import DiscriminationThreshold
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
import warnings
warnings.filterwarnings('ignore')
# Show every column when displaying wide dataframes
pd.set_option('display.max_columns', 999)

# Read the "churn" dataset.
# na_filter=False keeps blank cells (e.g. ' ' in TotalCharges) as literal
# strings so we can replace them explicitly below instead of letting pandas
# guess at missing values.
df_churn = pd.read_csv('/Users/abhisheksingh/Desktop/MLP/churn.csv', na_filter=False)

# Display top 5 records
df_churn.head()

# Print the number of rows and columns
print('Rows : ', df_churn.shape[0])
print('Columns : ', df_churn.shape[1])

# Print all the features as a list
df_churn.columns.to_list()

# Replace empty spaces with NaN.
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.)
df = df_churn.replace(' ', np.nan)

# Per-column True/False mask of missing values
missing_data = df.isnull()

# Print the missing/present counts for every column
for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())  # count True and False per column
    print("")

# Describe the data: number of unique values per feature
print ("\nUnique values : \n",df.nunique())

# Drop entire rows with null values (only ~0.15% of TotalCharges is missing)
df.dropna(axis=0, inplace=True)

# Verify nothing is missing any more
df.isnull().sum()

# Convert "TotalCharges" from string to a numeric dtype
df['TotalCharges'] = df['TotalCharges'].astype('float32')
df['TotalCharges'].dtype

# Replace "No internet service" with "No" for the add-on service columns,
# collapsing them to simple Yes/No categories.
cols = ['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
df[cols] = df[cols].replace('No internet service','No')
# Re-number the index after the row drops, keeping the original columns
df = df.reset_index()[df.columns]

# Double check for one column
df['OnlineSecurity'].value_counts()

# Recode "SeniorCitizen" from 1/0 to Yes/No so it reads as a category
df['SeniorCitizen'] = df['SeniorCitizen'].replace({1:'Yes',0:'No'})
df['SeniorCitizen'].value_counts()
# Convert a customer's tenure (in months) into a coarse tenure category.
def Tenure_lab(df):
    """Return the tenure bucket label for one dataframe row.

    Parameters
    ----------
    df : mapping with a 'tenure' key (a DataFrame row via apply(axis=1),
        or any dict-like), where tenure is the number of months.

    Returns
    -------
    str or None
        One of "Tenure_0-12", "Tenure_12-24", "Tenure_24-48",
        "Tenure_48-60", "Tenure_gt-60" (upper bounds inclusive).
        Returns None if tenure is NaN (no comparison matches),
        mirroring the original behaviour.
    """
    tenure = df['tenure']
    # The elif ladder makes the lower bounds implicit, so each bucket
    # only needs its upper bound.
    if tenure <= 12:
        return "Tenure_0-12"
    elif tenure <= 24:
        return "Tenure_12-24"
    elif tenure <= 48:
        return "Tenure_24-48"
    elif tenure <= 60:
        return "Tenure_48-60"
    elif tenure > 60:
        return "Tenure_gt-60"
# Build the tenure-category column by applying Tenure_lab to every row
# (axis=1 passes each row, a Series, into the function).
df['tenure_cat'] = df.apply(lambda df:Tenure_lab(df),axis = 1)
# Check the unique values in the newly created column
df['tenure_cat'].unique()
# Separate churn and not churn customers into variables churn, not_churn
churn = df[df['Churn'] == "Yes"]
not_churn = df[df['Churn'] == "No"]
churn.head()
not_churn.head()
df.info()
# Separate the identifier / target / categorical / numerical column names
Id_col = 'customerID'
target_col = 'Churn'
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
# NOTE(review): `cols not in target_col` is a SUBSTRING test against the
# string 'Churn'. It works here only because the sole column to exclude is
# 'Churn' itself; an exact comparison (cols != target_col) was intended.
cat_cols = [cols for cols in cat_cols if cols not in target_col]
num_cols = df.select_dtypes(exclude=['object']).columns.tolist()
cat_cols
num_cols
# Exploratory Data Analysis
# Construct the following functions:
# def plot_pie(column)
# Description : Using all categorical columns, construct a pie chart for churn and non churn customers
#?go.Layout
#?go.Pie
def plot_pie(column):
    """Render side-by-side donut charts of `column`'s value distribution
    for churn vs non-churn customers (reads the global `churn` and
    `not_churn` dataframes; displays inline via plotly offline)."""
    def donut_trace(frame, label, x_span):
        # One donut per customer group, occupying its own horizontal band.
        counts = frame[column].value_counts()
        return go.Pie(labels=counts.keys().tolist(),
                      values=counts.values.tolist(),
                      name=label,
                      domain=dict(x=x_span),
                      marker=dict(line=dict(width=2,
                                            color="rgb(243,243,243)")),
                      hole=.4)

    traces = [donut_trace(churn, "Churn Customers", [0, .48]),
              donut_trace(not_churn, "Not Churn Customers", [.52, 1])]

    # Centre a label inside each donut hole.
    labels = [dict(text="churn customers",
                   font=dict(size=10),
                   showarrow=False,
                   x=.18, y=.5),
              dict(text="Non churn customers",
                   font=dict(size=10),
                   showarrow=False,
                   x=.83, y=.5)]
    layout = go.Layout(dict(title=column,
                            plot_bgcolor="rgb(200,200,200)",
                            paper_bgcolor="rgb(250,250,250)",
                            annotations=labels))

    py.iplot(go.Figure(data=traces, layout=layout))
# Draw one pie figure per categorical column; customerID is an identifier,
# not a category, so it is skipped.
for i in [x for x in cat_cols if x!='customerID']:
    plot_pie(i)
#?go.Histogram
# def histogram(column)
# Description : Using all numerical columns, make histogram to compare churn and non churn customers.
def histogram(column):
    """Overlay percent-normalised histograms of `column` for churn vs
    non-churn customers (reads the global `churn` / `not_churn` frames;
    displays inline via plotly offline)."""
    traces = []
    # Churn trace first so it keeps the same colour/legend order.
    for frame, label in ((churn, "Churn Customers"),
                         (not_churn, "Non Churn Customers")):
        traces.append(go.Histogram(x=frame[column],
                                   histnorm="percent",
                                   name=label,
                                   opacity=.8))

    layout = go.Layout(dict(title=column,
                            plot_bgcolor="rgb(200,200,200)",
                            paper_bgcolor="rgb(250,250,250)",
                            xaxis=dict(gridcolor='rgb(255, 255, 255)',
                                       title=column),
                            yaxis=dict(gridcolor='rgb(255, 255, 255)',
                                       title="percent")))

    py.iplot(go.Figure(data=traces, layout=layout))
# One histogram figure per numeric column
for i in num_cols:
    histogram(i)
# Create a function called "def scatter_matrix(df)"
# Description : For all numeric columns construct scatterplot which can visualize churn and
# non churn customer in one graph.
# Pairwise scatter matrix of the numeric columns (KDE on the diagonal).
pd.plotting.scatter_matrix(df[num_cols], alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.show()
# Same pairwise view via seaborn
sns.pairplot(df[num_cols])
plt.show()
# For all different tenure groups build a barplot to visualize churn and non churn customers.
# Customer counts per tenure category, separately for churn / not churn.
bar_ch = churn['tenure_cat'].value_counts().reset_index()
bar_ch.columns = ['tenure_cat','count']
bar_nt_ch = not_churn['tenure_cat'].value_counts().reset_index()
bar_nt_ch.columns = ['tenure_cat','count']
bar_ch
bar_ch.columns
#bar - churn
bar1 = go.Bar(x = bar_ch['tenure_cat'] , y = bar_ch['count'],
              name = 'Churn Customers',
              opacity = .9)
#bar - not churn
bar2 = go.Bar(x = bar_nt_ch['tenure_cat'] , y = bar_nt_ch['count'],
              name = 'Non Churn Customers',
              opacity = .9)
layout = go.Layout(dict(title = "Churn in tenure groups",
                        plot_bgcolor = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = 'Tenure Category'),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = 'count'),
                        )
                   )
fig = go.Figure(data=[bar1,bar2],layout=layout)
py.iplot(fig)
#?sns.scatterplot
#ax = sns.scatterplot(x=churn['tenure_cat'], y=df[num_cols])
# Create a scatter plot between "Total Charges" and "Monthly Charges" and color code
# the data points using tenure categories.
df[['MonthlyCharges', 'TotalCharges','tenure',"tenure_cat"]]
# Scatter plot monthly charges & total charges by tenure categories:
# one trace per tenure category, each with its own marker colour.
scatter1 = []
scatter_plot = {'Tenure_0-12' : 'red', 'Tenure_12-24': 'green','Tenure_24-48':'purple',
                'Tenure_48-60':'yellow','Tenure_gt-60':'black'}
for keys,values in scatter_plot.items():
    scatter1.append(go.Scatter(x = df[df['tenure_cat'] == keys]['MonthlyCharges'],
                               y = df[df['tenure_cat'] == keys]['TotalCharges'],
                               mode = "markers",
                               marker = dict(line = dict(color = "black",
                                                         width = .2),
                                             size = 4 , color = values,
                                             symbol = "diamond-dot",
                                             ),
                               name = keys,
                               opacity = .9
                               ))
#layout
def layout_title(title):
    """Build the shared plotly Layout for the charges scatter plots:
    grey canvas, Monthly/Total charges axis labels, fixed 600px height.

    Parameters
    ----------
    title : str
        Figure title.

    Returns
    -------
    go.Layout
    """
    return go.Layout(dict(title=title,
                          plot_bgcolor="rgb(243,243,243)",
                          paper_bgcolor="rgb(243,243,243)",
                          xaxis=dict(gridcolor='rgb(255, 255, 255)',
                                     title="Monthly charges"),
                          yaxis=dict(gridcolor='rgb(255, 255, 255)',
                                     title="Total Charges"),
                          height=600))
# Figure: monthly vs total charges, coloured by tenure category
layout1 = layout_title("Monthly Charges & Total Charges by Tenure Categories")
fig1 = go.Figure(data = scatter1,layout = layout1)
py.iplot(fig1)
df
# Repeat above, color code to churn and not churn.
churn1=[]
churn_plot={'Yes':'red', 'No':'blue'}
for keys,values in churn_plot.items():
    churn1.append(go.Scatter(x = df[df['Churn'] == keys]['MonthlyCharges'],
                             y = df[df['Churn'] == keys]['TotalCharges'],
                             mode = 'markers',
                             marker = dict(line = dict(color = 'black',
                                                       width = .2),
                                           size = 4 , color = values,
                                           symbol = 'diamond-dot',
                                           ),
                             name = 'Churn - ' + keys,
                             opacity = .9
                             ))
layout2 = layout_title("Monthly Charges & Total Charges by Churn and not churn")
fig2 = go.Figure(data = churn1,layout = layout2)
py.iplot(fig2)
# Construct barplot for Total Charges and Monthly Charges seperately with tenure group
# on x-axis and bar should be depicting churn and not churn for each customer tenure group.
tenure_category = df[['tenure_cat','Churn','MonthlyCharges','TotalCharges']]
tenure_category[tenure_category['Churn']=='Yes']['tenure_cat']
tenure_category = df[['tenure_cat','Churn','MonthlyCharges','TotalCharges']]
churn_cat = {'Yes':'red','No':'blue'}
# column[0] = 'MonthlyCharges', column[1] = 'TotalCharges'
column = df[['MonthlyCharges','TotalCharges']].columns.to_list()
# One bar trace per churn group for each charge type
barplot1_M = []
barplot1_T = []
for i,j in churn_cat.items():
    barplot1_M.append(go.Bar(x = tenure_category.loc[tenure_category['Churn'] == i]['tenure_cat'],
                             y = tenure_category.loc[tenure_category['Churn'] == i][column[0]],
                             name=i, marker = dict(line = dict(width = 1,color=j))))
    barplot1_T.append(go.Bar(x = tenure_category.loc[tenure_category['Churn'] == i]['tenure_cat'],
                             y = tenure_category.loc[tenure_category['Churn'] == i][column[1]],
                             name=i,marker = dict(line = dict(width = 1,color=j))
                             ))
#function for layout
def layout_plot(title, xaxis_lab, yaxis_lab):
    """Return a plotly Layout with the given title and axis labels on a
    grey canvas (shared styling for the charge barplots).

    Parameters
    ----------
    title : str
    xaxis_lab, yaxis_lab : str
        Axis titles.

    Returns
    -------
    go.Layout
    """
    def styled_axis(label):
        # Both axes share identical grid styling; only the title differs.
        return dict(gridcolor='rgb(255, 255, 255)', title=label,
                    zerolinewidth=1, ticklen=5, gridwidth=2)

    return go.Layout(dict(title=title,
                          plot_bgcolor="rgb(243,243,243)",
                          paper_bgcolor="rgb(243,243,243)",
                          xaxis=styled_axis(xaxis_lab),
                          yaxis=styled_axis(yaxis_lab)))
# Barplot figures: monthly / total charges by tenure category, split by churn
layout1 = layout_plot('Monthly Charges by Tenure Category',
                      'Tenure Category','Monthly Charges')
fig1 = go.Figure(data=barplot1_M,layout=layout1)
layout2 = layout_plot('Total Charges by Tenure Category',
                      'Tenure Category','Total Charges')
fig2 = go.Figure(data=barplot1_T,layout=layout2)
py.iplot(fig1)
py.iplot(fig2)
cols = df[num_cols].corr().columns.to_list()
cols
# Construct a correlation heatmap for all numerical variables
cols = df[num_cols].corr().columns.to_list()
corr_array = np.array(df[num_cols].corr())
#Plotting
heat_map = go.Heatmap(z = corr_array,
                      x = cols,
                      y = cols,
                      colorscale = "Viridis",
                      colorbar = dict(title = "Correlation",
                                      titleside = "right"
                                      ) ,
                      )
layout = go.Layout(dict(title = "Correlation Matrix for Numeric variables",
                        autosize = False,
                        height = 720,
                        width = 800
                        )
                   )
fig = go.Figure(data=heat_map,layout=layout)
py.iplot(fig)
# Data Preprocessing
# Identifier column name
Id_col
# Target column name
target_col
# Categorical columns
cat_cols
# Numerical columns
num_cols
df.nunique()
# Separate binary columns (exactly 2 distinct values) into bin_cols
#df.nunique()
bin_cols = df.nunique()[df.nunique()==2].keys().to_list()
# Separate variables with multiple values into multi_cols
# (customerID is excluded: it is an identifier, not a feature)
col_req = [x for x in df.columns if x!='customerID']
multi_cols = df[col_req].nunique()[df[col_req].nunique()>2].keys().to_list()
multi_cols
df[multi_cols]
# Label encode binary columns using LabelEncoder()
bin_col1 = df[bin_cols].apply(LabelEncoder().fit_transform)
bin_col1
# Dummy (one-hot) encode multi_cols using pandas
df_1 = pd.get_dummies(df[multi_cols])
bin_col1
df_1
# Combine encoded binary and dummy columns
df_1 = pd.concat([df_1, bin_col1], axis=1)
df_1
# Scale numeric columns using StandardScaler()
# NOTE(review): `scalar` is computed here but never used again; the scaled
# frame actually comes from the df_num pipeline below.
scalar = StandardScaler().fit_transform(df[num_cols])
scalar
df_num = df[df.select_dtypes(exclude=['object']).keys()]
df_num.columns
df_num = pd.DataFrame(StandardScaler().fit_transform(df_num))
#df = pd.DataFrame(scale.fit_transform(df.values), columns=df.columns, index=df.index)
# NOTE(review): this rename assumes the numeric columns come out in the
# positional order tenure, MonthlyCharges, TotalCharges — TODO confirm.
df_num.rename(columns={0:'tenure_1',1:'MonthlyCharges_1',2:'TotalCharges_1'}, inplace=True)
#df.rename(columns={"A": "a", "B": "c"})
df_num
# Drop original numeric columns and merge the scaled values into df_final.
df_final = pd.concat([df_1,df_num],axis=1)
df_final
df_final.drop(['tenure','MonthlyCharges','TotalCharges'], axis=1, inplace=True)
df_final
# Model Building
# 75/25 train-test split with a fixed seed for reproducibility.
train,test = train_test_split(df_final,test_size = .25 ,random_state = 111)
# Feature columns: everything except the identifier and the target.
# BUG FIX: the original used `i not in Id_col + target_col`, which
# concatenates the two strings into 'customerIDChurn' and performs a
# SUBSTRING test on each column name; exact-match exclusion is intended.
cols = [i for i in df_final.columns if i not in (Id_col, target_col)]
X_train = train[cols]
y_train = train[target_col]
X_test = test[cols]
y_test = test[target_col]
def churn_prediction(algorithm,x_train,x_test,y_train,y_test,cols,threshold_plot) :
    """Fit `algorithm` on the training split, report its test performance,
    and plot the ROC curve.

    Parameters
    ----------
    algorithm : fitted-able sklearn-style classifier with predict_proba.
    x_train, x_test : feature matrices for the two splits.
    y_train, y_test : 0/1 target vectors for the two splits.
    cols : feature column names (unused here; kept for call compatibility).
    threshold_plot : bool
        If True, also show a yellowbrick DiscriminationThreshold plot.
    """
    # Fit on the training split and predict the held-out split.
    algorithm.fit(x_train,y_train)
    predictions = algorithm.predict(x_test)
    # Positive-class probabilities for the ROC curve.
    probabilities = algorithm.predict_proba(x_test)

    print(algorithm)
    print ("\n Classification report : \n",classification_report(y_test,predictions))
    print ("Accuracy Score : ",accuracy_score(y_test,predictions))
    # AUC computed on hard predictions (as in the original report)
    model_roc_auc = roc_auc_score(y_test,predictions)
    print ("Area under curve : ",model_roc_auc,"\n")
    fpr,tpr,thresholds = roc_curve(y_test,probabilities[:,1])

    if threshold_plot:
        # BUG FIX: the original fit the visualizer on the *global*
        # X_train/y_train instead of the x_train/y_train arguments.
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(x_train,y_train)
        visualizer.poof()

    # ROC curve from probabilities
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
    # Removed dead code: `if algorithm == 'randomForest': ...` compared a
    # fitted estimator object to a string, which is always False (and the
    # branch referenced a global). Also dropped the unused confusion_mat.
# Utility function to report the test scores
def report(results, n_top=3):
    """Print the models ranked 1..n_top from a CV `cv_results_` dict.

    Parameters
    ----------
    results : dict
        A cv_results_ mapping with 'rank_test_score', 'mean_test_score',
        'std_test_score' (array-like) and 'params' (list of dicts).
    n_top : int, optional
        How many top ranks to show (default 3).
    """
    for rank in range(1, n_top + 1):
        # All candidates sharing this rank (ties are possible).
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank : {0}".format(rank))
            print("Mean validation score : {0:.3f}(std: {1:.3f})".format(
                results['mean_test_score'][idx],
                results['std_test_score'][idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
# Fit and evaluate each baseline model.
# (Typo fixes in the progress messages: "Logitic" -> "Logistic",
#  "Logistixc" -> "Logistic", "Desicion" -> "Decision".)
print('\nRunning Logistic Regression \n')
logistic = LogisticRegression()
churn_prediction(logistic,X_train,X_test,y_train,y_test,cols,threshold_plot = True)
print('\n Logistic Regression Finished\n')
print('\n Running Decision Tree Unpruned\n')
# Create each decision tree: unpruned (grown to purity) and pruned (depth 4)
decisionTree_unpruned = DecisionTreeClassifier()
decisionTree = DecisionTreeClassifier(max_depth = 4)
# Fit and report each tree on the shared train/test split
churn_prediction(decisionTree_unpruned, X_train,X_test,y_train,y_test,cols,threshold_plot=True)
print('\nRunning Decision Tree Pruned\n')
churn_prediction(decisionTree, X_train,X_test, y_train, y_test,cols,threshold_plot=True)
print('\n Decision Tree Finished\n')
print('\n Running Random Forest \n')
randomForest = RandomForestClassifier()
churn_prediction(randomForest,X_train,X_test, y_train, y_test,cols,threshold_plot=True)
print('\nRandom Forest Finished\n')
# Specify parameters and distributions to sample from for the random search
param_dist = {'max_depth': [3,None],
              'max_features' : sp_randint(1,11),
              'min_samples_split' : sp_randint(2,11),
              'bootstrap' : [True, False],
              'criterion' : ['gini','entropy']}
# Run randomized search: 20 random parameter draws, 5-fold CV
n_iter_search = 20
random_search = RandomizedSearchCV(randomForest, param_distributions=param_dist,n_iter=n_iter_search, cv=5)
start = time()
random_search.fit(X_train, y_train)
print("Randomized Search took %.2f seconds for %d candidates " "parameter settings." % ((time()-start),n_iter_search))
report(random_search.cv_results_)
# We take the first one "Model with rank : 1" and check if the model has better performance
random_rf = RandomForestClassifier(bootstrap= False,
                                   criterion='entropy',
                                   max_features= 5, min_samples_split= 10)
churn_prediction(random_rf, X_train, X_test,y_train,y_test,cols,threshold_plot=True)
# Now we apply Grid Search
# Now use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1,3,10],
              "min_samples_split":[2,3,10],
              "bootstrap":[True, False],
              "criterion":["gini","entropy"]}
# Run grid search over every parameter combination, 5-fold CV
grid_search = GridSearchCV(random_rf, param_grid=param_grid, cv=5)
start = time()
grid_search.fit(X_train, y_train)
print("Grid Searchtook %.2f seconds for %d candidate parameter settings" % (time()-start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
# We take the second one "Model with rank : 2". As the Mean validation score is
# the same for all, we pick the one with the smaller standard deviation.
random_gs = RandomForestClassifier(bootstrap= True, criterion= 'entropy',
                                   max_features=10, min_samples_split=10)
churn_prediction(random_gs,X_train,X_test, y_train,y_test,cols, threshold_plot=True)
# As we can see the accuracy for logistic regression is the highest, i.e. 80%,
# but note there is a class-imbalance problem in the target:
df_final['Churn'].value_counts()
# Calculate the model performance of each model.
from sklearn.metrics import f1_score
# gives a one-row model report as a dataframe
def model_report(model,X_train,X_test,y_train,y_test,name) :
    """Fit `model`, predict on the test split, and return its headline
    metrics as a one-row DataFrame.

    Parameters
    ----------
    model : sklearn-style classifier.
    X_train, X_test : feature matrices.
    y_train, y_test : 0/1 target vectors.
    name : str
        Display name for the 'Model' column.

    Returns
    -------
    pd.DataFrame with columns Model, Accuracy_score, Recall_score,
    Precision, f1_score, Area_under_curve.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Insertion order fixes the output column order.
    metrics = {
        'Model': [name],
        'Accuracy_score': [accuracy_score(y_test, preds)],
        'Recall_score': [recall_score(y_test, preds)],
        'Precision': [precision_score(y_test, preds)],
        'f1_score': [f1_score(y_test, preds)],
        'Area_under_curve': [roc_auc_score(y_test, preds)],
    }
    return pd.DataFrame(metrics)
# Build the per-model performance rows
model1 = model_report(logistic,X_train,X_test,y_train,y_test,
                      "Logistic Regression")
model2 = model_report(decisionTree_unpruned,X_train,X_test,y_train,y_test,
                      "Decision Tree Unpruned")
# BUG FIX: the original passed decisionTree_unpruned here, so the
# "Decision Tree Pruned" row actually reported the unpruned tree.
model3 = model_report(decisionTree,X_train,X_test,y_train,y_test,
                      "Decision Tree Pruned")
model4 = model_report(randomForest,X_train,X_test,y_train,y_test,
                      "Random Forest Classifier")
model5 = model_report(random_rf,X_train,X_test,y_train,y_test,
                      "Random Forest Using<br>RandomizedSearchCV")
# (typo fix in the display string: "Usnig" -> "Using")
model6 = model_report(random_gs,X_train,X_test,y_train,y_test,
                      "Random Forest Using<br>GridSearchCV")
# Stack all rows into one comparison table and drop the stale index column
model_performances = pd.concat([model1,model2,model3,
                                model4,model5,model6],axis = 0).reset_index()
model_performances = model_performances.drop(columns = "index",axis =1)
# Render as a plotly table, rounded to 4 decimals
table = ff.create_table(np.round(model_performances,4))
py.iplot(table)